This notebook documents the steps taken to match municipality-level records from Clarke et al. (2024) (OpenDengue dengue incidence database, https://opendengue.org/data.html, accessed June 12, 2024) and Siraj et al. (2018) (municipality-level processed environmental data, Dryad: https://doi.org/10.5061/dryad.83nj1, accessed Feb 20, 2024).
The datasets use slightly different naming conventions and identifiers for Colombian municipalities and departments. We matched records based on municipality and department names, using manual review and spatial boundary checks with shape files in cases of minor discrepancies—following an approach similar to Clarke et al. (2024).
A shapefile is a standard geospatial format that stores the geometry and attributes of geographic features—in this case, the administrative boundaries of Colombian municipalities. These files are large and are therefore not included in this repository.
The purpose of this notebook is to document the linkage process clearly and transparently. For instructions on obtaining the required shapefiles, please refer to the accompanying preprint.
rm(list = ls())
knitr::opts_chunk$set(echo = TRUE)
WD <- "./" # Set your working directory here
setwd(WD)
# Load required libraries
library(sf) # Spatial data (shapefiles)
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(stringi) # String standardization
library(stringdist) # String distance metrics
library(plyr) # Data frame manipulations
library(dplyr) # Data frame manipulations
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # Visualization
# Visualize overlapping polygons for a given FAO GAUL code
#
# This function compares how a single FAO GAUL code is represented
# across two shapefiles (OpenDengue and OCHA) by plotting their
# corresponding polygons
#
# Arguments:
# - gaul_code: Numeric. FAO GAUL code to visualize.
# - shapefile_od: sf object. OpenDengue shapefile (admin-2 level).
# - shapefile_ocha: sf object. OCHA shapefile (admin-2 level).
# - data_od: Data frame. OpenDengue incidence data with columns
# 'adm_1_name' (department) and 'adm_2_name' (municipality).
#
# Returns:
# A ggplot object showing overlapping administrative boundaries
# for the given GAUL code:
# - Pink: OpenDengue shapefile polygons
# - Blue: OCHA shapefile polygons
#
# Example:
# plot_gaul_overlap(13521, shapefile_od = openDengue_shape,
# shapefile_ocha = ocha_shape,
# data_od = dengue_incidence)
plot_gaul_overlap <- function(gaul_code, shapefile_od, shapefile_ocha, data_od) {
# --- Standardize OCHA names for matching ---
shapefile_ocha$department_name <- toupper(
stri_trans_general(shapefile_ocha$ADM1_ES, "latin-ascii")
)
shapefile_ocha$municipality_name <- toupper(
stri_trans_general(shapefile_ocha$ADM2_ES, "latin-ascii")
)
# --- Subset OpenDengue shapefile by GAUL code ---
od_polygon <- shapefile_od[shapefile_od$GAUL_CODE == gaul_code, ]
# --- Identify municipalities linked to this GAUL code in the dengue data ---
linked_municipalities <- data_od[data_od$FAO_GAUL_code == gaul_code,
c("adm_1_name", "adm_2_name")]
linked_municipalities <- linked_municipalities[!duplicated(linked_municipalities), ]
# --- Subset OCHA shapefile by matching names ---
ocha_polygons <- subset(
shapefile_ocha,
municipality_name %in% toupper(linked_municipalities$adm_2_name) &
department_name %in% toupper(linked_municipalities$adm_1_name)
)
# --- Ensure both are sf objects ---
od_polygon <- st_as_sf(od_polygon)
ocha_polygons <- st_as_sf(ocha_polygons)
# --- Build visualization ---
ggplot() +
geom_sf(data = ocha_polygons, color = "darkblue", fill = NA, linewidth = 1) +
geom_sf(data = od_polygon, color = "deeppink2", fill = NA, linewidth = 1) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ggtitle(paste("FAO GAUL Code:", gaul_code))
}
# Visualize a single OpenDengue municipality vs OCHA polygon
#
# This function compares the geographic boundaries of one OpenDengue
# municipality (by GAUL code) with the corresponding OCHA polygon
# (by OCHA municipality ID)
#
# Arguments:
# - od_gaul_code: Numeric. GAUL code of the OpenDengue municipality.
# - ocha_munic_id: OCHA municipality ID (string).
# - od_shape: sf object. OpenDengue shapefile (admin-2 level).
# - ocha_shape: sf object. OCHA shapefile (admin-2 level).
#
# Returns:
# - A ggplot object showing overlapping polygons:
# * Pink = OpenDengue
# * Blue = OCHA
plot_gaul_byIDs <- function(od_gaul_code, ocha_munic_id, od_shape, ocha_shape) {
# Subset OpenDengue polygon by GAUL code
od_polygon <- od_shape[od_shape$GAUL_CODE == od_gaul_code, ]
# Subset OCHA polygon by municipality ID
ocha_polygon <- ocha_shape[ocha_shape$ADM2_PCODE == ocha_munic_id, ]
# Convert to sf objects for plotting
od_polygon <- st_as_sf(od_polygon)
ocha_polygon <- st_as_sf(ocha_polygon)
# Build plot
ggplot() +
geom_sf(data = ocha_polygon, color = "darkblue", fill = NA, linewidth = 1) +
geom_sf(data = od_polygon, color = "deeppink2", fill = NA, linewidth = 1) +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
ggtitle(paste0(
ocha_polygon$adm1_std, ": ", ocha_polygon$adm2_std
))
}
#Given a single dengue incidence row, this function identifies the best matching polygon(s) in the OCHA shapefile by comparing standardized municipality and department names using string distances. It returns the candidate polygon(s) along with the dengue ID for traceability.
link_municipality_to_ocha <- function(dengue_row, od_shape, ocha_shape) {
# Compute string distances for municipality names
ocha_shape$municipality_dist <- sapply(seq_len(nrow(ocha_shape)), function(j) {
stringdist(dengue_row$adm2_std, ocha_shape$adm2_std[j])
})
# Keep only OCHA polygons with minimal municipality distance
candidate_polygons <- ocha_shape[ocha_shape$municipality_dist == min(ocha_shape$municipality_dist), ]
# Compute string distances for department names among remaining candidates
candidate_polygons$department_dist <- sapply(seq_len(nrow(candidate_polygons)), function(j) {
stringdist(dengue_row$adm1_std, candidate_polygons$adm1_std[j])
})
# Keep only candidates with minimal department distance
candidate_polygons <- candidate_polygons[candidate_polygons$department_dist == min(candidate_polygons$department_dist), ]
# Add dengue ID for traceability
candidate_polygons$dengue_ID <- dengue_row$dengue_ID
return(candidate_polygons)
}
# Path to data from Siraj et al. (2018)
PATH_SIRAJ <- "../data/empirical/Siraj_et_al_2018/municip_aggregate_non_ts.csv"
# Path to dengue incidence data from Clarke et al. (2024)
PATH_OD <- "../data/empirical/Clarke_et_al_2024/Highest temporal resolution data_COLOMBIA_20021229_20191231.csv"
# Path to OCHA shapefile (https://data.humdata.org/dataset/cod-ab-col, accessed Feb 22, 2024)
PATH_OCHA_SHAPE <- "/Users/anna/Desktop/DD-Material/shapefiles/col-administrative-divisions-shapefiles/col_admbnda_adm2_mgn_20200416.shp"
# Path to OpenDengue shapefile (provided by Oliver Brady, Mar 19, 2024)
PATH_OPENDENGUE_SHAPE <- "/Users/anna/Desktop/DD-Material/shapefiles/OpenDengue_shapefile/Admin2(2011)_COL.shp"
# Read Siraj sample data
env_data <- read.csv(PATH_SIRAJ, header = TRUE)
# Read OpenDengue incidence data
dengue_incidence <- read.csv(PATH_OD, header = TRUE)
dengue_incidence <- dengue_incidence[dengue_incidence$T_res == "Week", ] # Keep only weekly data
# Read shapefiles
ocha_shape <- st_read(PATH_OCHA_SHAPE)
## Reading layer `col_admbnda_adm2_mgn_20200416' from data source
## `/Users/anna/Desktop/DD-Material/shapefiles/col-administrative-divisions-shapefiles/col_admbnda_adm2_mgn_20200416.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1122 features and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -81.73562 ymin: -4.229406 xmax: -66.84722 ymax: 13.39473
## Geodetic CRS: WGS 84
openDengue_shape <- st_read(PATH_OPENDENGUE_SHAPE)
## Reading layer `Admin2(2011)_COL' from data source
## `/Users/anna/Desktop/DD-Material/shapefiles/OpenDengue_shapefile/Admin2(2011)_COL.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 1087 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -81.72811 ymin: -4.230484 xmax: -66.86983 ymax: 13.39029
## Geodetic CRS: WGS 84
To verify data consistency, we perform two levels of checks:
ID matching: We confirm that municipality identifiers (numeric codes) from each dataset are found in the corresponding shapefile.
Name consistency: Because we had to rely on shapefiles that were released after the publication of the Siraj data, we checked whether the municipality names in Siraj and OCHA are consistent. To do this, we standardize both name sets (lowercase, remove accents) and compute a string distance score between matched IDs. A score of 0 means identical names. Most municipalities show excellent agreement, with minor differences due to missing diacritics or additional descriptors.
# Prepare and compare IDs
ocha_id <- gsub("CO", "", ocha_shape$ADM2_PCODE)
ocha_id <- as.integer(unique(ocha_id))
env_id <- as.integer(unique(env_data$ID_ESPACIA))
# Identify Siraj IDs not present in OCHA
table(env_id %in% ocha_id)
##
## FALSE TRUE
## 1 1122
absent_id <- setdiff(env_id, ocha_id)
env_data[env_data$ID_ESPACIA == absent_id, ] # Not present = "REA EN LITIGIO"
## ID_ESPACIA NOM_MUNICI Wpop2015 WpopBirths2015 UrbanPop MeanGCP_2005USD
## 376 99999 REA EN LITIGIO 3994.193 37.40853 0 3840.21
## MeanTraveltime
## 376 265.7429
# Confirm absence in incidence data
table(dengue_incidence$adm_2_name[grep("REA", dengue_incidence$adm_2_name)])
##
## ASTREA
## 166
# Check OpenDengue IDs
dengue_id <- as.integer(unique(dengue_incidence$FAO_GAUL_code))
od_id <- as.integer(unique(openDengue_shape$GAUL_CODE))
table(dengue_id %in% od_id)
##
## TRUE
## 1007
rm(absent_id, dengue_id, ocha_id, od_id, env_id)
# Standardize Siraj et al. names
env_names <- subset(env_data, select = c("ID_ESPACIA", "NOM_MUNICI"))
colnames(env_names) <- c("ID", "env_name")
env_names$env_name_std <- tolower(stri_trans_general(env_names$env_name, "latin-ascii"))
# Standardize OCHA names
ocha_names <- data.frame("ID" = ocha_shape$ADM2_PCODE, "ocha_name" = ocha_shape$ADM2_ES)
ocha_names$ID <- as.integer(gsub("CO", "", ocha_names$ID))
ocha_names$ocha_name_std <- tolower(stri_trans_general(ocha_names$ocha_name, "latin-ascii"))
# Merge by ID
matched_names <- merge(env_names, ocha_names, by = "ID")
# Calculate distance in name strings
matched_names$score <- stringdist(matched_names$env_name_std, matched_names$ocha_name_std)
table(matched_names$score) # Majority show minimal differences
##
## 0 1 2 3 4 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 695 327 15 2 2 1 4 1 6 8 7 9 10 3 2 2 1 1 3 2
## 21 22 23 29 31 34 35 38 40
## 11 2 1 1 2 1 1 1 1
matched_names <- matched_names[order(matched_names$score, decreasing = TRUE),]
# Inspect higher mismatches
#print(matched_names[which(matched_names$score > 1), c("env_name_std", "ocha_name_std")])
# Example: Municipality with larger mismatch score (ID 88564)
ocha_shape[which(ocha_shape$ADM2_PCODE == "CO88564"), ]
## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -81.39648 ymin: 13.32034 xmax: -81.3491 ymax: 13.39473
## Geodetic CRS: WGS 84
## Shape_Leng Shape_Area ADM2_ES ADM2_PCODE ADM2_REF ADM2ALT1ES
## 705 0.2982922 0.001830813 Providencia CO88564 <NA> <NA>
## ADM2ALT2ES ADM1_ES
## 705 <NA> Archipiélago de San Andrés, Providencia y Santa Catalina
## ADM1_PCODE ADM0_ES ADM0_PCODE date validOn validTo
## 705 CO88 Colombia CO 2018-01-01 2020-04-16 <NA>
## geometry
## 705 MULTIPOLYGON (((-81.36434 1...
env_data[which(env_data$ID_ESPACIA == 88564), ]
## ID_ESPACIA NOM_MUNICI Wpop2015 WpopBirths2015 UrbanPop MeanGCP_2005USD
## 1123 88564 SANTA CATALINA 5476.447 5.991263 0 NA
## MeanTraveltime
## 1123 595.5499
rm(env_names, ocha_names, matched_names)
Integrating the data from Siraj et al. (2018) with the dengue incidence data from the OpenDengue database (Clarke et al., 2024) requires careful consideration due to several structural inconsistencies between datasets.
Non-unique municipality names: Multiple municipalities (admin-2 level) may share the same name but belong to different departments (admin-1 level).
Missing diacritics in Siraj data: Municipality names in the Siraj et al. dataset omit special characters, complicating direct text matching.
Non-unique FAO GAUL codes: A small number of FAO GAUL codes correspond to multiple municipalities (2–4). This one-to-many mapping can cause ambiguity when merging data by GAUL code alone.
To illustrate the last issue, we visualize several FAO GAUL codes using the two available shapefiles:
# Extract unique combinations of GAUL code, municipality, and department
fao_gaul_lookup <- subset(
dengue_incidence,
select = c("FAO_GAUL_code", "full_name", "adm_2_name")
)
fao_gaul_lookup <- fao_gaul_lookup[!duplicated(fao_gaul_lookup$full_name), ]
# Print summary
cat("Number of unique municipalities:", nrow(fao_gaul_lookup), "\n")
## Number of unique municipalities: 1063
cat("Number of unique FAO GAUL codes:", length(unique(fao_gaul_lookup$FAO_GAUL_code)), "\n")
## Number of unique FAO GAUL codes: 1007
# Visualize examples of GAUL code overlaps
plot_gaul_overlap(
gaul_code = 13521,
shapefile_od = openDengue_shape,
shapefile_ocha = ocha_shape,
data_od = dengue_incidence
)
plot_gaul_overlap(
gaul_code = 13441,
shapefile_od = openDengue_shape,
shapefile_ocha = ocha_shape,
data_od = dengue_incidence
)
plot_gaul_overlap(
gaul_code = 13516,
shapefile_od = openDengue_shape,
shapefile_ocha = ocha_shape,
data_od = dengue_incidence
)
rm(fao_gaul_lookup)
We aim to link municipalities in the OpenDengue incidence data to their corresponding polygon names in the OCHA shapefile. We do not use the Siraj et al. data directly here because it lacks special characters (accents and diacritics), which would make exact name matching unreliable.
While geographic features alone are not sufficient for automatic linkage (see previous GAUL code visualizations), they are useful for quality control when municipality or department names do not match perfectly.
The approach used here:
Prepare a clean dengue dataset: remove duplicates and standardize municipality and department names to lowercase ASCII.
Standardize OCHA shapefile names similarly.
Link each dengue municipality to candidate OCHA polygons using string distance:
Trace which incidence record each polygon corresponds to.
This method allows minor discrepancies (e.g., missing accents or extra whitespace) to be handled automatically, while flagging ambiguous matches for manual inspection.
# Prepare dengue municipalities for linking
dengue_munic_link <- dengue_incidence[, c("adm_1_name", "adm_2_name", "full_name", "FAO_GAUL_code")]
dengue_munic_link <- dengue_munic_link[!duplicated(dengue_munic_link$full_name), ]
dengue_munic_link$dengue_ID <- seq_len(nrow(dengue_munic_link))
# Standardize names for matching
dengue_munic_link$adm2_std <- tolower(stri_trans_general(dengue_munic_link$adm_2_name, "latin-ascii"))
dengue_munic_link$adm1_std <- tolower(stri_trans_general(dengue_munic_link$adm_1_name, "latin-ascii"))
ocha_shape$adm2_std <- tolower(stri_trans_general(ocha_shape$ADM2_ES, "latin-ascii"))
ocha_shape$adm1_std <- tolower(stri_trans_general(ocha_shape$ADM1_ES, "latin-ascii"))
# Apply linking function for all dengue municipalities
link_results_list <- lapply(seq_len(nrow(dengue_munic_link)), function(i) {
link_municipality_to_ocha(
dengue_row = dengue_munic_link[i, ],
od_shape = openDengue_shape,
ocha_shape = ocha_shape
)
})
# Combine results into a single data frame
link_results <- do.call(rbind, link_results_list)
In this first step, we identify cases where both the standardized municipality name and department name match perfectly between the OpenDengue data and the OCHA shapefile. These are the “perfect matches,” which can be linked automatically.
# STEP 0: Initialize tracking variables ----
remaining_dengue_IDs <- unique(dengue_munic_link$dengue_ID) # IDs still to be linked
remaining_candidates <- link_results # Candidate matches from previous linking step
# STEP 1: Perfect matches ----
perfect_match_idx <- which(
remaining_candidates$municipality_dist == 0 &
remaining_candidates$department_dist == 0
)
# Sanity check: each dengue_ID should have a unique perfect match
stopifnot(length(unique(remaining_candidates$dengue_ID[perfect_match_idx])) ==
nrow(remaining_candidates[perfect_match_idx, ]))
# Extract final perfect matches
final_link_perfect <- remaining_candidates[perfect_match_idx, ]
# Remove linked municipalities from remaining tracking variables
remaining_dengue_IDs <- setdiff(remaining_dengue_IDs, final_link_perfect$dengue_ID)
remaining_candidates <- remaining_candidates[!remaining_candidates$dengue_ID %in% final_link_perfect$dengue_ID, ]
# Inspect remaining string distance scores (for manual review or next steps)
table(remaining_candidates$municipality_dist, remaining_candidates$department_dist)
##
## 0 3 4 5 6 7 8 9 10 15
## 0 0 55 4 1 1 3 3 1 33 0
## 1 11 0 0 0 0 0 0 0 0 0
## 2 3 2 3 0 5 0 0 0 0 0
## 3 6 1 1 0 1 1 0 0 0 0
## 4 3 0 0 1 2 1 1 1 0 0
## 7 0 0 0 1 0 0 0 0 0 0
## 8 2 0 0 2 1 0 0 0 0 0
## 9 2 0 2 0 1 2 0 0 0 0
## 10 2 0 0 0 0 0 1 0 0 0
## 11 2 0 0 3 1 0 0 0 0 1
## 12 1 0 0 0 1 0 0 1 0 0
## 13 3 0 1 1 0 0 0 0 0 0
## 14 2 0 0 0 0 0 0 0 0 0
## 15 2 0 0 0 0 0 0 0 0 0
## 16 1 0 0 0 0 0 0 0 0 0
## 17 1 0 0 0 0 0 0 0 0 0
In this step, we handle cases where the municipality name matches perfectly, but the department name differs slightly.
Many of these differences are minor spelling or abbreviation issues, such as:
norte de santander (OCHA) vs
norte santander (OpenDengue)valle del cauca (OCHA) vs valle
(OpenDengue)la guajira (OCHA) vs guajira
(OpenDengue)Visual inspection of the corresponding polygons confirms that these are indeed correct matches.
# STEP 2: Perfect municipality match, but department differs ----
dept_mismatch_idx <- which(
remaining_candidates$municipality_dist == 0 & remaining_candidates$department_dist > 0
)
# Inspect which department names are affected
table(remaining_candidates$adm1_std[dept_mismatch_idx])
##
## atlantico boyaca cauca la guajira
## 1 1 3 15
## meta narino norte de santander risaralda
## 1 1 39 1
## santander sucre valle del cauca
## 3 3 33
# Optional: visualize some key examples to confirm geographic match
#plot_gaul_byIDs(od_gaul_code = 14133, ocha_munic_id = "CO54206", od_shape = openDengue_shape, ocha_shape = ocha_shape)
#plot_gaul_byIDs(od_gaul_code = 13964, ocha_munic_id = "CO44090", od_shape = openDengue_shape, ocha_shape = ocha_shape)
#plot_gaul_byIDs(od_gaul_code = 14378, ocha_munic_id = "CO76233", od_shape = openDengue_shape, ocha_shape = ocha_shape)
# STEP 2a: Specific departments with minor differences ----
dept_corrections <- c("norte de santander", "valle del cauca", "la guajira")
correctable_idx <- which(
remaining_candidates$municipality_dist == 0 &
remaining_candidates$adm1_std %in% dept_corrections
)
# Sanity checks
# Only municipality distance is zero
table(remaining_candidates$municipality_dist[correctable_idx],
remaining_candidates$department_dist[correctable_idx])
##
## 3 10
## 0 54 33
# Verify department distance is consistent for these cases
table(remaining_candidates$department_dist[correctable_idx],
remaining_candidates$adm1_std[correctable_idx])
##
## la guajira norte de santander valle del cauca
## 3 15 39 0
## 10 0 0 33
# Each dengue ID should appear only once
stopifnot(length(unique(remaining_candidates$dengue_ID[correctable_idx])) == length(correctable_idx))
# Add these corrected links to the final set
final_link_perfect <- rbind(final_link_perfect, remaining_candidates[correctable_idx, ])
# Remove linked municipalities from remaining tracking variables
remaining_candidates <- remaining_candidates[!remaining_candidates$dengue_ID %in% final_link_perfect$dengue_ID, ]
remaining_dengue_IDs <- setdiff(remaining_dengue_IDs, final_link_perfect$dengue_ID)
# Inspect remaining string distance scores
table(remaining_candidates$municipality_dist, remaining_candidates$department_dist)
##
## 0 3 4 5 6 7 8 9 15
## 0 0 1 4 1 1 3 3 1 0
## 1 11 0 0 0 0 0 0 0 0
## 2 3 2 3 0 5 0 0 0 0
## 3 6 1 1 0 1 1 0 0 0
## 4 3 0 0 1 2 1 1 1 0
## 7 0 0 0 1 0 0 0 0 0
## 8 2 0 0 2 1 0 0 0 0
## 9 2 0 2 0 1 2 0 0 0
## 10 2 0 0 0 0 0 1 0 0
## 11 2 0 0 3 1 0 0 0 1
## 12 1 0 0 0 1 0 0 1 0
## 13 3 0 1 1 0 0 0 0 0
## 14 2 0 0 0 0 0 0 0 0
## 15 2 0 0 0 0 0 0 0 0
## 16 1 0 0 0 0 0 0 0 0
## 17 1 0 0 0 0 0 0 0 0
# Count remaining cases with perfect municipality but differing department
sum(remaining_candidates$municipality_dist == 0 & remaining_candidates$department_dist > 0)
## [1] 14
After handling the three department name differences
(norte de santander, valle del cauca,
la guajira), there are 14 remaining municipalities with a
perfect municipality match but mismatched department names. These appear
dubious and will not be included in the final linkage.
remaining_dept_mismatch <- subset(
remaining_candidates,
municipality_dist == 0 & department_dist > 0
)
# Create a concise summary table for inspection
remaining_summary <- data.frame(
dengue_ID = remaining_dept_mismatch$dengue_ID,
od_municipality = dengue_munic_link$adm2_std[match(remaining_dept_mismatch$dengue_ID, dengue_munic_link$dengue_ID)],
od_department = dengue_munic_link$adm1_std[match(remaining_dept_mismatch$dengue_ID, dengue_munic_link$dengue_ID)],
ocha_municipality = remaining_dept_mismatch$adm2_std,
ocha_department = remaining_dept_mismatch$adm1_std
)
# Print summary of remaining 14 dubious matches
print(remaining_summary)
## dengue_ID od_municipality od_department ocha_municipality ocha_department
## 1 26 la union valle la union sucre
## 2 163 providencia san andres providencia narino
## 3 250 san andres san andres san andres santander
## 4 386 candelaria valle candelaria atlantico
## 5 389 bolivar antioquia bolivar cauca
## 6 389 bolivar antioquia bolivar santander
## 7 561 restrepo valle restrepo meta
## 8 591 argelia valle argelia cauca
## 9 592 san pedro valle san pedro sucre
## 10 750 san pedro antioquia san pedro sucre
## 11 820 santuario antioquia santuario risaralda
## 12 956 bolivar valle bolivar cauca
## 13 979 san andres antioquia san andres santander
## 14 985 la victoria valle la victoria boyaca
# Remove these dubious links from the remaining candidates
dubious_idx <- which(remaining_candidates$municipality_dist == 0 & remaining_candidates$department_dist > 0)
dubious_IDs <- remaining_candidates$dengue_ID[dubious_idx]
remaining_candidates <- remaining_candidates[!remaining_candidates$dengue_ID %in% dubious_IDs, ]
remaining_dengue_IDs <- setdiff(remaining_dengue_IDs, dubious_IDs)
# Clean up temporary variables
rm(remaining_dept_mismatch, remaining_summary, dubious_idx, dubious_IDs)
In this step, we inspect cases where the department matches perfectly but the municipality name differs. There are 41 such cases in our dataset.
Because automated string matching cannot always resolve these discrepancies, we visually inspected the geographic polygons to decide whether the match seemed reasonable (partial overlap or similar shape) or should be discarded. This classification was performed manually and is, of course, subjective. The goal here was not to achieve perfect matches, but to establish robust links between dengue incidence data and spatiotemporal data from Siraj et al. (2018).
# STEP 3: Perfect Department Match, Municipality Mismatch ----
# Identify dengue municipalities with matching departments but differing municipality names
sum(remaining_candidates$department_dist == 0 &
remaining_candidates$municipality_dist > 0)
## [1] 41
# Build a table of all candidate pairs (preserving duplicates)
for (dengue_id in remaining_candidates$dengue_ID[
remaining_candidates$department_dist == 0
]) {
candidate_rows <- remaining_candidates[
remaining_candidates$dengue_ID == dengue_id,
]
iter_entry <- data.frame(
dengue_ID = dengue_id,
ocha_ID = unique(candidate_rows$ADM2_PCODE),
muni_OCHA = unique(candidate_rows$adm2_std),
muni_OD = dengue_munic_link$adm2_std[
dengue_munic_link$dengue_ID == dengue_id
],
dept_OCHA = unique(candidate_rows$adm1_std),
dept_OD = dengue_munic_link$adm1_std[
dengue_munic_link$dengue_ID == dengue_id
]
)
if (!exists("mismatch_summary")) {
mismatch_summary <- iter_entry
} else {
mismatch_summary <- rbind(mismatch_summary, iter_entry)
}
}
# Manually verified classifications based on visual inspection ----
badfit_IDs <- c(2, 3, 11, 13, 14, 19:23, 32, 35:43, 49)
goodfit_IDs <- setdiff(seq_len(nrow(mismatch_summary)), badfit_IDs)
# Visualize each candidate match ----
for (i in seq_len(nrow(mismatch_summary))) {
od_gaul_code <- dengue_munic_link$FAO_GAUL_code[
dengue_munic_link$dengue_ID == mismatch_summary$dengue_ID[i]
]
ocha_code <- mismatch_summary$ocha_ID[i]
g <- plot_gaul_byIDs(
od_gaul_code = od_gaul_code,
ocha_munic_id = ocha_code,
od_shape = openDengue_shape,
ocha_shape = ocha_shape
)
g <- g + ggtitle(
paste(i, ifelse(i %in% badfit_IDs, " - bad", " - good"))
)
print(g)
}
# Add "good" matches to final link table ----
final_link_perfect <- rbind(
final_link_perfect,
remaining_candidates[
remaining_candidates$dengue_ID %in%
mismatch_summary$dengue_ID[goodfit_IDs],
]
)
# Remove processed entries from remaining_candidates ----
remaining_candidates <- remaining_candidates[
!remaining_candidates$dengue_ID %in% mismatch_summary$dengue_ID,
]
remaining_dengue_IDs <- setdiff(
remaining_dengue_IDs,
mismatch_summary$dengue_ID
)
# Inspect remaining match distances
table(
remaining_candidates$department_dist,
remaining_candidates$municipality_dist
)
##
## 2 3 4 7 8 9 10 11 12 13
## 3 2 1 0 0 0 0 0 0 0 0
## 4 3 1 0 0 0 2 0 0 0 1
## 5 0 0 1 1 2 0 0 3 0 1
## 6 5 1 2 0 1 1 0 1 1 0
## 7 0 1 1 0 0 2 0 0 0 0
## 8 0 0 1 0 0 0 1 0 0 0
## 9 0 0 1 0 0 0 0 0 1 0
## 15 0 0 0 0 0 0 0 1 0 0
# Clean up temporary objects
rm(dengue_id, candidate_rows, iter_entry, mismatch_summary,
badfit_IDs, goodfit_IDs, od_gaul_code, ocha_code, g)
After the previous matching steps, a small number of municipalities remain unmatched. In this step, we manually inspected the standardized department and municipality names from both datasets to check for possible correspondences. Since none showed strong evidence of being valid matches, these entries were excluded from the final linkage table.
# STEP 4: Manual inspection of remaining unmatched municipalities ----
for (dengue_id in remaining_candidates$dengue_ID) {
# Extract all OCHA candidates for this dengue municipality
ocha_candidates <- remaining_candidates[
remaining_candidates$dengue_ID == dengue_id,
c("adm1_std", "adm2_std")
]
ocha_candidates$geometry <- NULL
colnames(ocha_candidates) <- c("dept_OCHA", "muni_OCHA")
# Extract dengue municipality names
dengue_entry <- dengue_munic_link[
dengue_munic_link$dengue_ID == dengue_id,
c("adm1_std", "adm2_std")
]
colnames(dengue_entry) <- c("dept_OD", "muni_OD")
if (!exists("remaining_df")) {
remaining_df <- data.frame(dengue_id = dengue_id,
dept_OD = dengue_entry$dept_OD,
muni_OD = dengue_entry$muni_OD,
dept_OCH = ocha_candidates$dept_OCHA,
muni_OCH = ocha_candidates$muni_OCHA)
} else {
remaining_df <- rbind(remaining_df,
data.frame(dengue_id = dengue_id,
dept_OD = dengue_entry$dept_OD,
muni_OD = dengue_entry$muni_OD,
dept_OCH = ocha_candidates$dept_OCHA,
muni_OCH = ocha_candidates$muni_OCHA))
}
}
print(remaining_df)
## dengue_id dept_OD muni_OD dept_OCH
## 1 41 tolima mariquita bolivar
## 2 42 bolivar cartagena arauca
## 3 94 magdalena ariguani (el dificil) cundinamarca
## 4 139 norte santander cucuta norte de santander
## 5 204 putumayo san miguel (la dorada) boyaca
## 6 208 sucre tolu boyaca
## 7 208 sucre tolu boyaca
## 8 208 sucre tolu boyaca
## 9 208 sucre tolu boyaca
## 10 208 sucre tolu boyaca
## 11 208 sucre tolu boyaca
## 12 208 sucre tolu boyaca
## 13 208 sucre tolu boyaca
## 14 208 sucre tolu boyaca
## 15 217 narino tumaco bolivar
## 16 228 cauca patia (el bordo) boyaca
## 17 228 cauca patia (el bordo) cesar
## 18 228 cauca patia (el bordo) boyaca
## 19 228 cauca patia (el bordo) cesar
## 20 230 sucre galeras (nueva granada) magdalena
## 21 274 narino colon (genova) quindio
## 22 322 sucre coloso (ricaurte) narino
## 23 385 antioquia yondo (casabe) tolima
## 24 399 narino arboleda (berruecos) norte de santander
## 25 409 guainia puerto inirida amazonas
## 26 409 guainia puerto inirida amazonas
## 27 409 guainia puerto inirida amazonas
## 28 409 guainia puerto inirida amazonas
## 29 428 sucre toluviejo bolivar
## 30 444 valle buga boyaca
## 31 449 valle darien huila
## 32 454 huila isnos (san jose de isnos) boyaca
## 33 478 antioquia san vicente bolivar
## 34 514 bolivar tiquisio (puerto rico) meta
## 35 704 cordoba purisima antioquia
## 36 718 bogota bogota boyaca
## 37 850 narino alban (san jose) caldas
## 38 858 narino mallama (piedrancha) amazonas
## 39 859 cauca piendamo tolima
## 40 880 choco bojaya (bellavista) boyaca
## 41 880 choco bojaya (bellavista) cordoba
## 42 880 choco bojaya (bellavista) sucre
## 43 880 choco bojaya (bellavista) boyaca
## 44 880 choco bojaya (bellavista) cordoba
## 45 880 choco bojaya (bellavista) sucre
## 46 880 choco bojaya (bellavista) boyaca
## 47 880 choco bojaya (bellavista) cordoba
## 48 880 choco bojaya (bellavista) sucre
## 49 883 guainia pana pana (campo alegre) huila
## 50 965 narino magui (payan) guainia
## 51 967 narino santa barbara (iscuande) magdalena
## 52 976 cauca sotara boyaca
## 53 976 cauca sotara boyaca
## 54 976 cauca sotara boyaca
## 55 976 cauca sotara boyaca
## 56 976 cauca sotara boyaca
## 57 976 cauca sotara boyaca
## 58 976 cauca sotara boyaca
## 59 976 cauca sotara boyaca
## 60 976 cauca sotara boyaca
## 61 1006 choco belen de bajira risaralda
## muni_OCH
## 1 margarita
## 2 saravena
## 3 agua de dios
## 4 cacota
## 5 san miguel de sema
## 6 toca
## 7 togui
## 8 tota
## 9 toca
## 10 togui
## 11 tota
## 12 toca
## 13 togui
## 14 tota
## 15 turbaco
## 16 paz de rio
## 17 rio de oro
## 18 paz de rio
## 19 rio de oro
## 20 nueva granada
## 21 genova
## 22 ricaurte
## 23 santa isabel
## 24 arboledas
## 25 puerto arica
## 26 puerto narino
## 27 puerto arica
## 28 puerto narino
## 29 rio viejo
## 30 tuta
## 31 garzon
## 32 san jose de pare
## 33 san jacinto
## 34 puerto rico
## 35 buritica
## 36 socota
## 37 san jose
## 38 la pedrera
## 39 piedras
## 40 buenavista
## 41 buenavista
## 42 buenavista
## 43 buenavista
## 44 buenavista
## 45 buenavista
## 46 buenavista
## 47 buenavista
## 48 buenavista
## 49 campoalegre
## 50 mapiripana
## 51 santa barbara de pinto
## 52 soata
## 53 sora
## 54 soraca
## 55 soata
## 56 sora
## 57 soraca
## 58 soata
## 59 sora
## 60 soraca
## 61 belen de umbria
# No obvious good fits identified — linkage process completed.
# Remaining entries are excluded from the final linkage dataset.
rm(dengue_id, ocha_candidates, dengue_entry)
After completing the three matching steps, 1009 out of 1063 municipalities (≈95%) were successfully linked between the OpenDengue (Clarke et al. 2024) and OCHA shapefile identifiers. The remaining cases were manually reviewed but discarded due to ambiguous or inconsistent matches.
# Step 5 — Store final matched results --------------------------------------
# Quick preview of the final link table
head(final_link_perfect)
## Simple feature collection with 6 features and 19 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.38351 ymin: 3.745484 xmax: -71.94885 ymax: 10.79196
## Geodetic CRS: WGS 84
## Shape_Leng Shape_Area ADM2_ES ADM2_PCODE
## 294 2.0688161 0.077934078 El Carmen de Bolívar CO13244
## 532 2.2326910 0.092943392 Magangué CO13430
## 262 1.8718671 0.096242304 Cubará CO15223
## 745 1.4605427 0.032741165 Purificación CO73585
## 1058 0.5499045 0.008376978 Usiacurí CO08849
## 889 2.4343919 0.091328821 San Vicente de Chucurí CO68689
## ADM2_REF ADM2ALT1ES ADM2ALT2ES ADM1_ES ADM1_PCODE ADM0_ES
## 294 El Carmen de Bolivar <NA> <NA> Bolívar CO13 Colombia
## 532 Magangue <NA> <NA> Bolívar CO13 Colombia
## 262 Cubara <NA> <NA> Boyacá CO15 Colombia
## 745 Purificacion <NA> <NA> Tolima CO73 Colombia
## 1058 Usiacuri <NA> <NA> Atlántico CO08 Colombia
## 889 San Vicente de Chucuri <NA> <NA> Santander CO68 Colombia
## ADM0_PCODE date validOn validTo adm2_std adm1_std
## 294 CO 2018-01-01 2020-04-16 <NA> el carmen de bolivar bolivar
## 532 CO 2018-01-01 2020-04-16 <NA> magangue bolivar
## 262 CO 2018-01-01 2020-04-16 <NA> cubara boyaca
## 745 CO 2018-01-01 2020-04-16 <NA> purificacion tolima
## 1058 CO 2018-01-01 2020-04-16 <NA> usiacuri atlantico
## 889 CO 2018-01-01 2020-04-16 <NA> san vicente de chucuri santander
## municipality_dist department_dist dengue_ID geometry
## 294 0 0 1 MULTIPOLYGON (((-75.3143 9....
## 532 0 0 2 MULTIPOLYGON (((-74.73933 9...
## 262 0 0 3 MULTIPOLYGON (((-72.17368 7...
## 745 0 0 4 MULTIPOLYGON (((-74.86873 3...
## 1058 0 0 5 MULTIPOLYGON (((-74.97205 1...
## 889 0 0 6 MULTIPOLYGON (((-73.52601 7...
# Proportion of successfully matched municipalities
n_matched <- nrow(final_link_perfect)
n_total <- nrow(dengue_munic_link)
cat("Matched municipalities:", n_matched, "of", n_total,
"(", round(100 * n_matched / n_total, 1), "%)\n")
## Matched municipalities: 1009 of 1063 ( 94.9 %)
# Keep only linkage-relevant columns and drop geometry if present
final_link_clean <- final_link_perfect %>%
st_drop_geometry() %>%
select(dengue_ID, ADM2_PCODE) %>%
rename(ocha_ID = ADM2_PCODE)
# Sanity checks for duplicates
cat("Duplicated dengue_IDs:", any(duplicated(final_link_clean$dengue_ID)), "\n")
## Duplicated dengue_IDs: FALSE
cat("Duplicated ocha_IDs:", any(duplicated(final_link_clean$ocha_ID)), "\n")
## Duplicated ocha_IDs: FALSE
# Merge back with dengue metadata for reference
linked_results <- dengue_munic_link %>%
inner_join(final_link_clean, by = "dengue_ID")
# Save the linkage table
write.table(
linked_results,
file = "../data/empirical/IDLinks.txt",
col.names = TRUE,
row.names = FALSE,
quote = FALSE,
sep = "\t"
)
cat("Linkage table saved as 'IDLinks.txt'\n")
## Linkage table saved as 'IDLinks.txt'